library(foreign)
library(plyr)
library(xtable)
library(arm)	
library(Amelia)
library(MASS)	
library(car)
library(gmodels)
library(reporttools)
library(sandwich)
library(lmtest)

# Read the raw district-court appeals data from the Stata file.
all.appeals <- read.dta(file = 'Israel district appeals.dta')

# Recode 99 as missing: flag every cell equal to 99, then overwrite the
# flagged cells with NA. (Rows with missing key fields are dropped later in
# the script, not here.)
ninety.nines <- (all.appeals == 99)
all.appeals[ninety.nines] <- NA


############## drop irrelevant fields
# Names of all columns that are removed from the raw data. "arabs_J" is
# intentionally not listed here (it was commented out of this list).
leave.out <- c(
  "jarab1", "jfem1", "jreligiousity1",
  "jarab2", "jfem2", "jreligiousity2",
  "jarab3", "jfem3", "jreligiousity3",
  # "arabs_J",
  "prosecutor_arab", "prosecutor_female",
  "defender_public", "defender_arab", "defender_female",
  "accused_s_num", "accused_israeli",
  "magistrate_arab", "magistrate_female",
  "PublicOrder", "Property", "BodilyHarm",
  "Morality", "TrafficLicencing", "Fraud",
  "Regulatory", "Fiscal", "EconomicBusiness",
  "security", "deferred", "detained", "Report",
  "Prison_term", "Com_term", "Fine", "Compensation",
  "prison_reduced", "prison_increased",
  "Unanimous", "judge2_majority", "judge2_ruling", "judge3_majority",
  "distc1", "distc2", "distc3",
  "avg_jage", "avg_jexp",
  "methodPlea", "anyvictim",
  "victimRace1", "victimRace3", "victimRace4",
  "prequest1", "prequest3", "prequest4",
  "appellant3",
  "encoder", "id", "year", "date", "date2", "accused_name",
  "judge1", "judge2", "judge3",
  "jYOB1", "jYOB2", "jYOB3",
  "jYON_M1", "jYON_M2", "jYON_M3",
  "jYON_D1", "jYON_D2", "jYON_D3",
  "accused_Num", "accused_nationality",
  "Magistrate_date", "Magistrate", "location", "method",
  "Offences_new", "security_type", "activating_defered_M",
  "Report_positive", "Prosecutorial_request", "victim", "Appellant"
)

# Keep every column whose position is not matched by a name in leave.out.
leave.in <- all.appeals[, -match(leave.out, names(all.appeals))]

####### transform factors into indicator (logical) columns

# Outcome indicators derived from the verdict factor. The one case with a
# missing verdict is treated as 'More lenient'. For that same case `harsher`
# must be FALSE rather than NA: an NA here would count as a missing response
# in the missingness filter further down and the row would be discarded,
# undoing the recode.
leave.in$lenient <- is.na(leave.in$verdict) | leave.in$verdict == 'More lenient'
leave.in$harsher <- !is.na(leave.in$verdict) & leave.in$verdict == 'Harsher'

# Court indicators (NA wherever court itself is NA).
leave.in$court_Nazareth <- leave.in$court == 'Nazareth'
leave.in$court_TLV <- leave.in$court == 'TLV'
leave.in$court_Jerusalem <- leave.in$court == 'Jerusalem'

# Build an order-independent identifier for each judge panel: take the three
# judge codes, replace missing codes by 0, sort the codes within each case and
# paste them together with '.' as separator.
judge.codes <- cbind(all.appeals$jcode1, all.appeals$jcode2, all.appeals$jcode3)
judge.codes[is.na(judge.codes)] <- 0 #get rid of NAs
judge.codes <- t(apply(judge.codes, 1, sort))
leave.in$uniq.panel <- as.factor(
  paste(judge.codes[, 1], judge.codes[, 2], judge.codes[, 3], sep = '.')
)

# Drop the source columns now that they have been expanded. A logical mask is
# used instead of `-which(...)`, because a negative empty index would remove
# every column if none of these names were present.
expanded.cols <- c('court', 'verdict', 'jcode1', 'jcode2', 'jcode3')
leave.in <- leave.in[, !(colnames(leave.in) %in% expanded.cols), drop = FALSE]

all.appeals <- leave.in


######### drop observations with NAs in key accused_arab and narabs_J columns

# Rows are filtered with logical masks rather than `-which(is.na(...))`.
# When no value is missing, which() returns integer(0), and indexing with a
# negative empty vector selects zero rows, i.e. it would silently discard the
# whole data set instead of leaving it untouched.
all.appeals <- all.appeals[!is.na(all.appeals$accused_arab), ]
if (any(is.na(all.appeals$narabs_J))) {
  all.appeals <- all.appeals[!is.na(all.appeals$narabs_J), ]
}

# Collapse the columns whose names contain 'jage' / 'jexp' into row-wise
# means (ignoring NAs; a row with no observed value yields NaN), then remove
# the source columns.
age.idx <- grep('jage', colnames(all.appeals))
exp.idx <- grep('jexp', colnames(all.appeals))
# drop = FALSE keeps a data frame even if only one column matches, which
# rowMeans() requires.
all.appeals$avg_jage <- rowMeans(all.appeals[, age.idx, drop = FALSE],
                                 na.rm = TRUE)
all.appeals$avg_jexp <- rowMeans(all.appeals[, exp.idx, drop = FALSE],
                                 na.rm = TRUE)
# Select the columns to keep by position instead of negating the indices:
# a negative empty index would drop every column.
keep.idx <- setdiff(seq_along(all.appeals), c(age.idx, exp.idx))
all.appeals <- all.appeals[, keep.idx, drop = FALSE]



##### handle missingness

#response variables
my.resp.vars <- c('lenient','harsher','incarceration','Prison_term_D')

# Per-row NA counts: na.counts.cov counts missing cells across ALL columns
# (responses included); na.counts.resp counts them in the response variables
# only.
na.counts.cov <- rowSums(is.na(all.appeals))
na.counts.resp <- rowSums(is.na(all.appeals[my.resp.vars]))

# Discard rows with any response variable missing, or with more than two
# missing cells overall. The removal is guarded: with nothing to remove,
# `all.appeals[-integer(0), ]` would select zero rows and wipe the data set.
bad.apples <- which(na.counts.cov > 2 | na.counts.resp > 0)
if (length(bad.apples) > 0) {
  all.appeals <- all.appeals[-bad.apples, ]
}


##################### now impute the rest of the covariates #####################

# Drop two irrelevant columns ('arabs_J' and 'num') if they are present.
# A logical mask is used for both: `-which(colnames(...) == 'num')` would
# remove every column whenever 'num' is absent.
irrelevant.cols <- c('arabs_J', 'num')
all.appeals <- all.appeals[, !(colnames(all.appeals) %in% irrelevant.cols),
                           drop = FALSE]

# Data set handed to the imputation step, and the name of the panel
# identifier column.
appeals <- all.appeals
panel.v <- c('uniq.panel')

# Use the Amelia package to create n.impute imputed versions of the data.
# n.impute is defined before the call and passed explicitly as `m`, so the
# value saved below always matches the number of imputations actually run
# (previously it was set afterwards and only matched amelia()'s default).
n.impute <- 5

# Set the random seed so that the imputation is reproducible.
set.seed(2768830)

# The panel identifier and court_Jerusalem are declared as id variables:
# they are kept in the output but not used in the imputation model.
# p2s = 0 suppresses printing to the screen.
a.out <- amelia(appeals, m = n.impute,
                idvars = c(panel.v, 'court_Jerusalem'), p2s = 0)

save(a.out, appeals, n.impute, file = 'imputed_data.Rdata')
